/*
program to mask the word structure information
*/

/*
mask_it varname

Replaces every distinct value of the string variable varname by a "mask" of the
same length and stores it in a new variable named varname_masked.

How a mask is built:
  - every character position receives a character drawn at random (with
    replacement) from the set of distinct non-digit characters that occur in
    the data;
  - the digits 0-9 are kept unchanged at their position;
  - one-character words are kept unchanged;
  - if several distinct words receive the same mask, all but one of them get
    their mask reversed.

The data in memory keeps its observations and their order. The program creates
(and therefore requires the absence of) the working variables sortorder,
identifier, charid, char, random, char_shuffler, char_shuffled, id and char#.
The random-number functions are called in a fixed order, so results are
reproducible after a set seed.
*/
capture program drop mask_it
program mask_it
	args word
	tempfile TEMP1 TEMP2
	tempvar MATCH LENGTH

	/* remember the original row order; long (not the default float) keeps _n
	   exact beyond 16,777,216 observations */
	gen long sortorder=_n
	save `TEMP1', replace

	/* reduce the data to one row per distinct word */
	gen `MATCH'=1
	collapse (sum) `MATCH', by(`word') fast
	drop `MATCH'
	gen long identifier=_n

	/* one row per character of each distinct word */
	gen `LENGTH'=ustrlen(`word')
	expand `LENGTH'
	bysort identifier: gen charid=_n
	gen char=usubstr(`word',charid,1)

	/* lookup table: the distinct non-digit characters in random order,
	   numbered 1..char_sum */
	preserve
	gen `MATCH'=1
	collapse (sum) `MATCH', by(char) fast
	drop if ustrregexm(char,"[0-9]")
	gen random=runiform()
	sort random
	gen char_shuffler=_n
	keep char char_shuffler
	rename char char_shuffled
	local char_sum=_N
	save `TEMP2', replace
	restore

	/* draw a random table entry for every character position; the table is
	   unique on char_shuffler, hence m:1, and table rows that were never
	   drawn are not added to the data */
	gen char_shuffler=runiformint(1,`char_sum')
	merge m:1 char_shuffler using `TEMP2', keep(master match) nogenerate
	sort identifier charid
	/* digits are never replaced */
	replace char_shuffled=char if ustrregexm(char,"[0-9]")

	/* concatenate the characters from the last position backwards: after the
	   loop the row with charid 1 holds the complete mask in char1. The next
	   row in sort order is the following character of the same word, or a row
	   whose char# value is empty when the word ends here. */
	sum `LENGTH'
	local to=r(max)
	forvalues c=`to'(-1)1 {
		if `c'==`to' {
			gen str char`c'=char_shuffled if charid==`c'
		}
		else {
			local plusone=`c'+1
			gen char`c'=char_shuffled+char`plusone'[_n+1] if charid==`c'
		}
	}

	keep if charid==1
	keep `word' char1

	/* one character words are not masked */
	replace char1=`word' if ustrlen(char1)==1

	/* duplicated masks: reverse all but the first occurrence */
	bysort char1: gen id=_n
	replace char1=ustrreverse(char1) if id>1
	drop id

	/* attach the masks to the original data and restore its order */
	merge 1:m `word' using `TEMP1', nogenerate
	sort sortorder

	rename char1 `word'_masked
	drop sortorder
end
*********************************************************************
/* start preparation of files for each of N groups */
/* base directory is the current working directory; the group id is the first
   argument passed to this do-file */
local dir `c(pwd)'
local group `1'

/* the number of rows in the group index file drives the main loop and the
   progress countdown */
use bible_index_`group', clear
local till = _N
local counter = `till'

/* Main loop: one pass per row of the group index file. Each pass reads one
   translation, tokenizes it into words, rebuilds the text of each selected
   book character by character, masks and shuffles it once (validation step),
   and finally writes three text files per book: original, structure, order. */
forvalues i=1/`till' {
	/* the index is reloaded on every pass because the data in memory is replaced further down */
	use bible_index_`group', clear
	local translation=translation[`i']
	local location=ISO[`i']
	noisily di "`translation' (left: `counter') "
	qui {	
	/* count down the number of translations still to be processed */
	local counter=`counter'-1
	
	/* book ids to process, compared against the first two characters of vid below */
	
	local books "40 41 42 43 44 66"
	
	use "`dir'\\corpus\\`translation'.dta" , clear
		drop if sentence==""
		gen book=real(usubstr(vid,1,2))
		/* shuffle verses: random row order, made reproducible by the fixed seed */
		set seed 081116
		gen random=runiform()
		sort random
		drop random
		/* 
 	Remark: word tokenization is based on the paralleltext.info procedure,
 	except for the exceptions described below 	*/
 	
 	
 	/* one word per line */		
 	/* The translations listed in the condition are tokenized with the Unicode
 	   word functions, using the ISO value of the index row as locale argument;
 	   all other translations are split with the plain wordcount and word functions.
 	   NOTE(review): cmn-x-bible-newsimplified appears twice in the condition;
 	   redundant but harmless. */
 	if "`translation'"=="khm-x-bible-newworld"|"`translation'"=="cmn-x-bible-cuvmp-simplified"|"`translation'"=="cmn-x-bible-csb-simplified"|"`translation'"=="cmn-x-bible-newsimplified"| "`translation'"=="mya-x-bible-newworld"|"`translation'"=="khm-x-bible-2011"|"`translation'"=="khm-x-bible-standard2005"|"`translation'"=="ang-x-bible"|"`translation'"=="cmn-x-bible-newsimplified"|"`translation'"=="ksw-x-bible"|"`translation'"=="mya-x-bible-common"|"`translation'"=="mya-x-bible-1835" {
 	/* treat dash-separated words as one word: the dash is swapped for a
 	   placeholder before tokenizing and restored afterwards */
 	replace sentence=usubinstr(sentence,"-","HYPHEN",.)
 	gen sentid=_n
 	gen sentencelength=ustrwordcount(sentence, "`location'")	
 	expand sentencelength
	sort sentid
	bysort sentid: gen wordid=_n
	gen word=ustrword(sentence,wordid, "`location'")
	replace word=usubinstr(word,"HYPHEN","-",.)
	}
	else {
	/* one row per word: each sentence row is expanded by its word count */
	gen sentid=_n
	gen sentencelength=wordcount(sentence)	
 	expand sentencelength
	sort sentid
	bysort sentid: gen wordid=_n
	gen word=word(sentence,wordid)
	}
	save grouped_data_`group', replace
 	/* Rebuild the running text of every book from single characters.
 	   First data set: the characters of all words with a trailing space after each
 	   word, numbered by position (id), then ordered non-spaces first, spaces last.
 	   Second data set: the characters of the words without any spaces.
 	   Both are joined on the running number merger and sorted by id again.
 	   Because the sort on random is commented out, every space returns to its
 	   original position. */
	 		
	foreach book of local books		{						

		use if book==`book' using grouped_data_`group', clear	
		set seed 261016
		gen random=runiform()
		/* no shuffle for original version: random is generated but not used for sorting */
		*sort random
		replace word=word+" "
		gen identifier=_n
			
		gen length=ustrlen(word)
		expand length
		sort identifier
		
		/* NOTE(review): id is not an existing variable at this point; the name
		   presumably resolves to identifier through variable abbreviation */
		bysort id: gen charid=_n
		
		/* single character at position charid of the word */
		gen char=usubstr(usubstr(word,charid,charid+1),1,1)
		keep char
		gen id=_n
		gen space=0
		replace space=1 if char==" "
		/* non-space characters first, in text order, then all spaces */
		sort space id
		gen merger=_n
		tempfile spec_mask
		save `spec_mask', replace
		
		/* second data set: the same characters, but without the trailing spaces */
		use if book==`book' using grouped_data_`group', clear
		gen identifier=_n
			
		gen length=ustrlen(word)
		expand length
		sort identifier
		
		bysort id: gen charid=_n
		
		gen char=usubstr(usubstr(word,charid,charid+1),1,1)
		keep char 
		gen merger=_n
		/* rows that exist only in the saved file are the spaces; they are appended by the merge */
		merge 1:1 merger using `spec_mask'
		
		sort id
		/* protect spaces with a placeholder, write one character per line, then
		   remove the line breaks and turn the placeholder back into a space */
		replace char="***SPACE***" if char==" "
		outsheet char using "text_`group'.csv", nonames noquote replace
		filefilter text_`group'.csv text2_`group'.csv, from(\r\n) to(`""') replace
		filefilter text2_`group'.csv `book'_`group'.csv, from("***SPACE***") to(" ") replace	
	}
	/* new data set with one row per book; the rebuilt text of each book is read
	   from its file into the strL variable sentence.
	   NOTE(review): the book ids are hard-coded here and repeat the books macro. */
	clear
	set obs 6
	gen str vid="40" 
	replace vid="41" in 2
	replace vid="42" in 3
	replace vid="43" in 4
	replace vid="44" in 5
	replace vid="66" in 6
	gen strL sentence=""
	foreach book in 40 41 42 43 44 66 { 
		replace sentence=fileread("`book'_`group'.csv") if vid=="`book'"
	}
	
	/* split the rebuilt text into words again; sentid now identifies the book row */
	gen sentid=_n
	gen sentencelength=wordcount(sentence)	
 	expand sentencelength
	sort sentid
	bysort sentid: gen wordid=_n
	gen word=word(sentence,wordid)
		
	compress	
	gen book=real(usubstr(vid,1,2))
	keep word sentid book
	drop if word==""|word==" "
	
	
	save grouped_data_`group', replace
	
	foreach book of local books		{						

				use if book==`book' using grouped_data_`group', clear	
				/* validation part: first destroy both word structure and word order by masking/scrambling,
		then calculate relative entropy by masking/scrambling again. Theoretical expectation: both D_o & D_s should be roughly zero */
				/* first pass: mask word structure and continue with the masked words */
				mask_it word
				drop word
				rename word_masked word
				/* destroy word order: random order within sentid */
				gen random=runiform()
				sort sentid random
				drop random	
				
		
				/* second pass: mask the already masked words; afterwards word_original
				   holds the input of this pass and word_masked its mask */
				 	
				 	mask_it word
				 	rename word word_original
				 	
		
					/* write one text file per version. After the keep only one variable
					   is left, so the name word in the outsheet call presumably resolves
					   to it through variable abbreviation - NOTE(review): confirm that
					   variable abbreviation is enabled. */
					foreach type in original structure order {
					preserve
					if "`type'" =="order" {
						/* the whole string is shuffled this time, because the verse structure is destroyed */
					gen random=runiform()
					sort random
					}
					if "`type'" =="structure"  {
						keep word_masked
					}
					if  "`type'" =="original" | "`type'" =="order" {
						keep word_original
					}
										
					outsheet word using "text_`group'.txt", nonames noquote replace
					
					/* join the one-word-per-line file into a single space-separated text */
					filefilter text_`group'.txt `"`dir'\length_validation_II_g\\`translation'_`book'_`type'.txt"', from(\r\n) to(`" "') replace
					restore
					}	
									}

							}
						}
		/* cleaning up: remove the intermediate files of this group. Each file is
		   listed once; capture ignores files that do not exist. The list also
		   contains the group index file itself. */
		foreach file in 40_`group'.csv 41_`group'.csv 42_`group'.csv 43_`group'.csv 44_`group'.csv 66_`group'.csv text2_`group'.csv text_`group'.csv grouped_data_`group'.dta bible_index_`group'.dta raw_`group'.raw raw2_`group'.raw char_shuffler_`group'.dta zip_`group'.zip text_`group'.txt distorted_`group'.txt text2_`group'.txt raw1_`group'.raw group_`group'.dta id_`group'.dta temp_`group'.dta {
			capture erase `file'
		}

/* completion marker: save a one-observation data set named after the group */
clear
set obs 1
generate v = 1
save finished_`group', replace

/* empty the memory and leave Stata */
clear
exit, STATA



